library(chemhelper) #for sim_*(), plot_pca(), plot_plsda(), get_VIP(), and get_loadings()
library(tidyverse)
library(ropls) #for opls(), which does pca and pls-da
library(cowplot) #for plot_grid()
theme_set(theme_gray())
library(iheatmapr) #for correlation heatmaps
library(here)
Read in datasets
I’ll read in the datasets from “Generating Datasets.Rmd” and pick individual datasets to make figures
null.list <- read_rds(here("data", "null.rds"))
control.list <- read_rds(here("data", "control.rds"))
needle.list <- read_rds(here("data", "needle.rds"))
set.seed(999)
null <- null.list[[sample(1:100, 1)]]
control <- control.list[[sample(1:100, 1)]]
needle <- needle.list[[sample(1:100, 1)]]
Null
The code chunk below creates a data frame with a grouping variable with two groups (a and b) with n=10 per group, 5 variables that don’t covary, and two groups of 10 variables that moderately covary.
# # myseed <- runif(1, 0, 1000)
# myseed <- 401.6368
#
# df <- data_frame(group = rep(c("a", "b"), each = 10))
# no_discr <- df %>%
# sim_covar(5, var = 1, cov = 0, name = "uncorr", seed = myseed) %>%
# sim_covar(10, var = 1, cov = 0.5, name = "corrA", seed = myseed + 1) %>%
# sim_covar(10, var = 1, cov = 0.5, name = "corrB", seed = myseed + 2)
This heatmap shows the correlation structure.
# no_discr %>%
null %>%
select(-group) %>%
cor() %>%
iheatmap(row_labels = TRUE, col_labels = TRUE, name = "cor")
null.cor <-
# no_discr %>%
null %>%
select(-group) %>%
cor() %>%
as_data_frame(rownames = "row") %>%
gather(-row, key = col, value = cor)
# cor1 <-
null.cor.p <-
ggplot(null.cor, aes(x = col, y = row, fill = cor)) +
geom_tile() +
scale_fill_gradient2("Correlation Coeficient") +
labs(x = NULL, y = NULL) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90), legend.position = "top")
Then I perform PCA and PLS-DA on this dataset, forcing both models to have two axes (for plotting). The PCA score plot shows no separation and the first two axes explain more than half of the variation in the data. The PLS-DA score plot shows slight separation of the groups (even though there is none). It’s a pretty terrible model with a negative Q2 value and a low R2 value.
null.pca.p <- plot_pca(null.pca, null$group, annotate = "subtitle")
null.pls.p <- plot_plsda(null.pls, annotate = "subtitle")
plot_grid(null.cor.p,
null.pca.p + theme(legend.position = "NULL"),
null.pls.p + theme(legend.position = "NULL"), nrow = 1, ncol = 3)

Needle in a haystack
Keeping the total number of observations and variables the same, and using the same random seed, I create a second data set with 5 variables that do not covary within groups, but differ in their means between groups.
# discr <- df %>%
# sim_covar(10, var = 1, cov = 0.55, name = "corrA", seed = myseed + 1) %>%
# sim_covar(10, var = 1, cov = 0.55, name = "corrB", seed = myseed + 2) %>%
# sim_discr(5, var = 1, cov = 0, group_means = c(-1, 1), name = "discr", seed = myseed + 3)
This shows the correlation structure of the data. Because the discriminating variables have different means in the two groups, they are correlated about as strongly as the covarying variables.
needle %>%
# discr %>%
select(-group) %>%
cor() %>%
iheatmap(row_labels = TRUE, col_labels = TRUE, name = "cor")
needle.cor <-
needle %>%
# discr %>%
select(-group) %>%
cor() %>%
as_data_frame(rownames = "row") %>%
gather(-row, key = col, value = cor)
needle.cor.p <-
ggplot(needle.cor, aes(x = col, y = row, fill = cor)) +
geom_tile() +
scale_fill_gradient2("Correlation Coeficient") +
labs(x = NULL, y = NULL) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90), legend.position = "top")
# needle.cor.p
The PCA score plot still shows no real separation because the discriminating variables do not contribute much to the total covariation in the data. The PLS-DA plot shows very strong separation, despite the differences between the groups being only due to 5 out of 25 variables. The PLS-DA model performs well, with high R2 and Q2 values
needle.pca.p <- plot_pca(needle.pca, needle$group, annotate = "subtitle")
needle.pls.p <- plot_plsda(needle.pls, annotate = "subtitle")
plot_grid(needle.cor.p,
needle.pca.p + theme(legend.position = "NULL"),
needle.pls.p + theme(legend.position = "NULL"),
nrow = 1, ncol = 3)

Control
control.cor <-
control %>%
select(-group) %>%
cor() %>%
as_data_frame(rownames = "row") %>%
gather(-row, key = col, value = cor)
control.cor.p <-
ggplot(control.cor, aes(x = col, y = row, fill = cor)) +
geom_tile() +
scale_fill_gradient2("Correlation Coeficient") +
labs(x = NULL, y = NULL) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90), legend.position = "top")
control.pca.p <- plot_pca(control.pca, control$group, annotate = "subtitle")
control.pls.p <- plot_plsda(control.pls, annotate = "subtitle")
plot_grid(control.cor.p,
control.pca.p + theme(legend.position = "NULL"),
control.pls.p + theme(legend.position = "NULL"),
nrow = 1, ncol = 3)

Save plots and tables
fig1_new <-
plot_grid(null.cor.p,
null.pca.p + theme(legend.position = "none") + labs(title = ""),
null.pls.p + theme(legend.position = "none") + labs(title = ""),
control.cor.p,
control.pca.p + theme(legend.position = "none") + labs(title = ""),
control.pls.p + theme(legend.position = "none") + labs(title = ""),
needle.cor.p,
needle.pca.p + theme(legend.position = "none") + labs(title = ""),
needle.pls.p + theme(legend.position = "none") + labs(title = ""),
ncol = 3, nrow = 3, labels = "AUTO")
save_plot(here("figs", "fig1_new.png"), fig1_new, ncol = 3, nrow = 3)
old, not run
VIP scores from the PLS-DA model identify all 5 discriminating variables as being important for separating the groups (VIP > 1). Using the VIP > 1 cutoff, it also spurriously identifies two of the other variables as being important. Discriminating variables are not strongly correlated with the first two axes of the PCA.
Red Herring
Large variation with weak discrimination plus small variation with large discrimination
---
title: "PCA vs. PLS with no and few discriminating variables"
author: "Eric R. Scott"
output: html_notebook
---
```{r message=FALSE, warning=FALSE}
library(chemhelper) #for sim_*(), plot_pca(), plot_plsda(), get_VIP(), and get_loadings()
library(tidyverse)
library(ropls) #for opls(), which does pca and pls-da
library(cowplot) #for plot_grid()
theme_set(theme_gray())
library(iheatmapr) #for correlation heatmaps
library(here)
```

# Read in datasets
I'll read in the datasets from "Generating Datasets.Rmd" and pick individual datasets to make figures

```{r}
null.list <- read_rds(here("data", "null.rds"))
control.list <- read_rds(here("data", "control.rds"))
needle.list <- read_rds(here("data", "needle.rds"))
```

```{r}
set.seed(999)
null <- null.list[[sample(1:100, 1)]]
control <- control.list[[sample(1:100, 1)]]
needle <- needle.list[[sample(1:100, 1)]]
```

# Null

The code chunk below creates a data frame with a grouping variable with two groups (a and b) with n=10 per group, 5 variables that don't covary, and two groups of 10 variables that moderately covary.

```{r}
# # myseed <- runif(1, 0, 1000)
# myseed <- 401.6368
# 
# df <- data_frame(group = rep(c("a", "b"), each = 10))
# no_discr <- df %>%
#   sim_covar(5, var = 1, cov = 0, name = "uncorr", seed = myseed) %>% 
#   sim_covar(10, var = 1, cov = 0.5, name = "corrA", seed = myseed + 1) %>% 
#   sim_covar(10, var = 1, cov = 0.5, name = "corrB", seed = myseed + 2)
```

This heatmap shows the correlation structure.

```{r}
# no_discr %>% 
null %>%   
  select(-group) %>% 
  cor() %>% 
  iheatmap(row_labels = TRUE, col_labels = TRUE, name = "cor")
```
```{r}
null.cor <-
  # no_discr %>% 
  null %>% 
  select(-group) %>% 
  cor() %>%
  as_data_frame(rownames = "row") %>% 
  gather(-row, key = col, value = cor)

# cor1 <-
null.cor.p <-
  ggplot(null.cor, aes(x = col, y = row, fill = cor)) +
  geom_tile() +
  scale_fill_gradient2("Correlation Coeficient") +
  labs(x = NULL, y = NULL) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90), legend.position = "top")
```

Then I perform PCA and PLS-DA on this dataset, forcing both models to have two axes (for plotting).  The PCA score plot shows no separation and the first two axes explain more than half of the variation in the data.  The PLS-DA score plot shows slight separation of the groups (even though there is none).  It's a pretty terrible model with a negative Q^2^ value and a low R^2^ value.

```{r include=FALSE}
null.pca <- opls(select(null, -group), plot = FALSE)
null.pls <- opls(select(null, -group), null$group, plot = FALSE, predI = 2, permI = 500)
```

```{r message=FALSE, warning=FALSE}
null.pca.p <- plot_pca(null.pca, null$group, annotate = "subtitle")
null.pls.p <- plot_plsda(null.pls, annotate = "subtitle")
plot_grid(null.cor.p,
          null.pca.p + theme(legend.position = "NULL"),
          null.pls.p + theme(legend.position = "NULL"), nrow = 1, ncol = 3)
```

# Needle in a haystack

Keeping the total number of observations and variables the same, and using the same random seed, I create a second data set with 5 variables that do not covary within groups, but differ in their means between groups.

```{r}
# discr <- df %>% 
#   sim_covar(10, var = 1, cov = 0.55, name = "corrA", seed = myseed + 1) %>% 
#   sim_covar(10, var = 1, cov = 0.55, name = "corrB", seed = myseed + 2) %>% 
#   sim_discr(5, var = 1, cov = 0, group_means = c(-1, 1), name = "discr", seed = myseed + 3)
```

This shows the correlation structure of the data.  Because the discriminating variables have different means in the two groups, they are correlated about as strongly as the covarying variables.

```{r}
needle %>% 
# discr %>% 
  select(-group) %>% 
  cor() %>% 
  iheatmap(row_labels = TRUE, col_labels = TRUE, name = "cor")
```
```{r}
needle.cor <-
  needle %>% 
  # discr %>% 
  select(-group) %>% 
  cor() %>%
  as_data_frame(rownames = "row") %>%  
  gather(-row, key = col, value = cor)

needle.cor.p <-
  ggplot(needle.cor, aes(x = col, y = row, fill = cor)) +
  geom_tile() +
  scale_fill_gradient2("Correlation Coeficient") +
  labs(x = NULL, y = NULL) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90), legend.position = "top")
# needle.cor.p
```
The PCA score plot still shows no real separation because the discriminating variables do not contribute much to the total covariation in the data.  The PLS-DA plot shows very strong separation, despite the differences between the groups being only due to 5 out of 25 variables.  The PLS-DA model performs well, with high R^2^ and Q^2^ values

```{r include=FALSE}
needle.pca <- opls(select(needle, -group), plot = FALSE)
needle.pls <- opls(select(needle, -group), needle$group, plot = FALSE, predI = 2, permI = 500)
```

```{r message=FALSE, warning=FALSE}
needle.pca.p <- plot_pca(needle.pca, needle$group, annotate = "subtitle")
needle.pls.p <- plot_plsda(needle.pls, annotate = "subtitle")
plot_grid(needle.cor.p,
          needle.pca.p + theme(legend.position = "NULL"),
          needle.pls.p + theme(legend.position = "NULL"),
          nrow = 1, ncol = 3)
```

# Control
```{r}
control.cor <-
  control %>% 
  select(-group) %>% 
  cor() %>%
  as_data_frame(rownames = "row") %>% 
  gather(-row, key = col, value = cor)

control.cor.p <-
  ggplot(control.cor, aes(x = col, y = row, fill = cor)) +
  geom_tile() +
  scale_fill_gradient2("Correlation Coeficient") +
  labs(x = NULL, y = NULL) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90), legend.position = "top")
```

```{r include=FALSE}
control.pca <- opls(select(control, -group), plot = FALSE)
control.pls <- opls(select(control, -group), control$group, plot = FALSE, predI = 2, permI = 500)
```


```{r message=FALSE, warning=FALSE}
control.pca.p <- plot_pca(control.pca, control$group, annotate = "subtitle")
control.pls.p <- plot_plsda(control.pls, annotate = "subtitle")
plot_grid(control.cor.p,
          control.pca.p + theme(legend.position = "NULL"),
          control.pls.p + theme(legend.position = "NULL"),
          nrow = 1, ncol = 3)
```

# Save plots and tables

```{r}
fig1_new <-
  plot_grid(null.cor.p,
            null.pca.p + theme(legend.position = "none") + labs(title = ""),
            null.pls.p + theme(legend.position = "none") + labs(title = ""),
            control.cor.p,
            control.pca.p + theme(legend.position = "none") + labs(title = ""),
            control.pls.p + theme(legend.position = "none") + labs(title = ""),
            needle.cor.p,
            needle.pca.p + theme(legend.position = "none") + labs(title = ""),
            needle.pls.p + theme(legend.position = "none") + labs(title = ""),
            ncol = 3, nrow = 3, labels = "AUTO")

save_plot(here("figs", "fig1_new.png"), fig1_new, ncol = 3, nrow = 3)
```





# old, not run

VIP scores from the PLS-DA model identify all 5 discriminating variables as being important for separating the groups (VIP > 1).  Using the VIP > 1 cutoff, it also spurriously identifies two of the other variables as being important.  Discriminating variables are not strongly correlated with the first two axes of the PCA.

```{r eval=FALSE, include=FALSE}
VIPs <- get_VIP(pls2) %>%
  arrange(desc(VIP))

pca.load <- get_loadings(pca2) %>%
  select(Variable, p1, p2) %>%
  arrange(desc(abs(p1)))
```

```{r eval=FALSE, include=FALSE}
table1 <- full_join(VIPs, pca.load) %>%
  arrange(Variable) %>% 
  rename(`PLS-DA VIP score` = VIP, `PC1 loading` = p1, `PC2 loading` = p2) %>% 
  mutate_if(is.double, ~round(.,4))
table1
write_excel_csv(table1, "figs/table1.csv")
```

# Red Herring
Large variation with weak discrimination plus small variation with large discrimination

```{r eval=FALSE, include=FALSE}
herring <- df %>% 
  sim_discr(p = 10, var = 1, cov = 0.5, group_means = c(-0.5, 0.5), name = "highvar") %>% 
  sim_discr(p = 10, var = 1, cov = 0, group_means = c(-1, 1), name = "lowvar") %>% 
  sim_covar(p = 5, var = 1, cov = 0, name = "noise")
```

```{r eval=FALSE, include=FALSE}
herring %>% 
  select(-group) %>% 
  cor() %>% 
  iheatmap(row_labels = TRUE, col_labels = TRUE, name = "cor")
```
```{r eval=FALSE, include=FALSE}
herring.cor <- herring %>% 
  select(-group) %>% 
  cor() %>%
  as.data.frame() %>% 
  rownames_to_column(var = "row") %>% 
  gather(-row, key = col, value = cor)

cor3 <- ggplot(herring.cor, aes(x = col, y = row, fill = cor)) +
  geom_tile() +
  scale_fill_gradient2("Correlation Coeficient") +
  labs(x = NULL, y = NULL) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90), legend.position = "top")
```


```{r eval=FALSE, include=FALSE}
pca3 <- opls(select(herring, -group), plot = FALSE)
pls3 <- opls(select(herring, -group), no_discr$group, plot = FALSE, predI = 2, permI = 500)
```
```{r eval=FALSE, include=FALSE}
pca3.plot <- plot_pca(pca3, herring$group, annotate = "subtitle")
pls3.plot <- plot_plsda(pls3, annotate = "subtitle")
plot_grid(cor3,
          pca3.plot + theme(legend.position = "NULL"),
          pls3.plot + theme(legend.position = "NULL"), nrow = 1, ncol = 3)
```

```{r eval=FALSE, include=FALSE}
VIPs3 <- get_VIP(pls3) %>%
  arrange(desc(VIP))

pca3.load <- get_loadings(pca3) %>%
  select(Variable, p1, p2) %>%
  arrange(desc(abs(p1)))

table3 <- full_join(VIPs3, pca3.load) %>%
  arrange(Variable) %>% 
  rename(`PLS-DA VIP score` = VIP, `PC1 loading` = p1, `PC2 loading` = p2) %>% 
  mutate_if(is.double, ~round(.,4))
table3 %>% arrange(desc(`PLS-DA VIP score`))
table3 %>% arrange(desc(`PC1 loading`))
# write_excel_csv(table1, "figs/table1.csv")

```






